# load packages, installing if missing
# if (!require(librarian)){
#   install.packages("librarian")
#   library(librarian)
#}
librarian::shelf(
  dismo, dplyr, DT, ggplot2, here, htmltools, leaflet, mapview, purrr, raster, readr, rgbif, rgdal, rJava, sdmpredictors, sf, spocc, tidyr)
## 
##   The 'cran_repo' argument in shelf() was not set, so it will use
##   cran_repo = 'https://cran.r-project.org' by default.
## 
##   To avoid this message, set the 'cran_repo' argument to a CRAN
##   mirror URL (see https://cran.r-project.org/mirrors.html) or set
##   'quiet = TRUE'.
select <- dplyr::select # overwrite raster::select
options(readr.show_col_types = FALSE)

# set random seed for reproducibility
set.seed(42)

# directory to store data
dir_data <- here("data/sdm")
dir.create(dir_data, showWarnings = F, recursive = T)

Overview

This machine learning analysis was completed as an assignment for my Master’s program course, Environmental Data Science 232: Machine Learning. It was assigned by our professor, Dr. Ben Best, as an introduction to machine learning by predicting presence of a chosen species from observations and environmental data found on the site iNaturalist. It follows guidance found at Species distribution modeling | R Spatial.

My chosen species is coyote brush (Baccharis pilularis). Baccharis pilularis is native to the west coast of the United States (Oregon, California, and Baja California, Mexico). It is a shurb in the Asteraceae (Sunflower) family with oblanceolate to obovate toothed leaves, panicle-like inflorescence with staminate flowers that when mature mimic snow, and generally sticky (not a pun) (Bogler 2012).

Baccharis pilularis Image Credit: CalScape

Explore

The first step in this machine learning excercise is to download observation data of Baccharis pilularis from the iNaturalist site.

Aquire species observations

obs_csv <- file.path(dir_data, "obs.csv")
obs_geo <- file.path(dir_data, "obs.geojson")
redo    <- FALSE

if (!file.exists(obs_geo) | redo){
  # get species occurrence data from GBIF with coordinates
  (res <- spocc::occ(
    query = 'Baccharis pilularis', 
    from = 'gbif', 
    has_coords = T,
    limit = 10000))
  
  # extract data frame from result
  df <- res$gbif$data[[1]] 
  readr::write_csv(df, obs_csv)
  
  # convert to points of observation from lon/lat columns in data frame
  obs <- df %>% 
    sf::st_as_sf(
      coords = c("longitude", "latitude"),
      crs = st_crs(4326)) %>% 
    select(prov, key) # save space (joinable from obs_csv)
  sf::write_sf(obs, obs_geo, delete_dsn=T)
}
obs <- sf::read_sf(obs_geo)
nrow(obs) # number of rows
## [1] 10000
# show points on map
# mapview::mapview(obs, map.types = "CartoDB.Voyager")
  • Question 1. How many observations total are in GBIF for your species?

There are 10000` observations for Baccharis pilularis in this data. According to the iNaturalist site, over 19,000 observations have been uploaded of this species.

  • Question 2. Do you see any odd observations, like marine species on land or vice versa?

There are no observably inaccurate data points for this species.

Aquire environmental data

The next step is to use the Species Distribution Model predictors R package sdmpredictors to get underlying environmental data for Baccharis pilularis observations.

Environmental data
dir_env <- file.path(dir_data, "env")

# set a default data directory
options(sdmpredictors_datadir = dir_env)

# choosing terrestrial
env_datasets <- sdmpredictors::list_datasets(terrestrial = TRUE, marine = FALSE)

# show table of datasets
env_datasets %>% 
  select(dataset_code, description, citation) %>% 
  DT::datatable()
# choose datasets for a vector
env_datasets_vec <- c("WorldClim", "ENVIREM")

# get layers
env_layers <- sdmpredictors::list_layers(env_datasets_vec)
DT::datatable(env_layers)
# choose layers after some inspection and perhaps consulting literature
env_layers_vec <- c("WC_alt", "WC_bio1", "WC_bio2", "ER_tri", "ER_topoWet")

# get layers
env_stack <- load_layers(env_layers_vec)

# interactive plot layers, hiding all but first (select others)
# mapview(env_stack, hide = T) # makes the html too big for Github
plot(env_stack, nc=2)

Region of Interest

The environmental data is on a global scale. Here we crop the environmental rasters to a region of interest around the distribution of Baccharis pilularis.

obs_hull_geo  <- file.path(dir_data, "obs_hull.geojson")
env_stack_grd <- file.path(dir_data, "env_stack.grd")

if (!file.exists(obs_hull_geo) | redo){
  # make convex hull around points of observation
  obs_hull <- sf::st_convex_hull(st_union(obs))
  
  # save obs hull
  write_sf(obs_hull, obs_hull_geo)
}
obs_hull <- read_sf(obs_hull_geo)

# show points on map
mapview(
  list(obs, obs_hull))
if (!file.exists(env_stack_grd) | redo){
  obs_hull_sp <- sf::as_Spatial(obs_hull)
  env_stack <- raster::mask(env_stack, obs_hull_sp) %>% 
    raster::crop(extent(obs_hull_sp))
  writeRaster(env_stack, env_stack_grd, overwrite=T)  
}
env_stack <- stack(env_stack_grd)

# show map
# mapview(obs) + 
#   mapview(env_stack, hide = T) # makes html too big for Github
plot(env_stack, nc=2)

Pseudo-Absence

absence_geo <- file.path(dir_data, "absence.geojson")
pts_geo     <- file.path(dir_data, "pts.geojson")
pts_env_csv <- file.path(dir_data, "pts_env.csv")

if (!file.exists(absence_geo) | redo){
  # get raster count of observations
  r_obs <- rasterize(
    sf::as_Spatial(obs), env_stack[[1]], field=1, fun='count')
  
  # show map
  # mapview(obs) + 
  #   mapview(r_obs)
  
  # create mask for 
  r_mask <- mask(env_stack[[1]] > -Inf, r_obs, inverse=T)
  
  # generate random points inside mask
  absence <- dismo::randomPoints(r_mask, nrow(obs)) %>% 
    as_tibble() %>% 
    st_as_sf(coords = c("x", "y"), crs = 4326)
  
  write_sf(absence, absence_geo, delete_dsn=T)
}
absence <- read_sf(absence_geo)

# show map of presence, ie obs, and absence
mapview(obs, col.regions = "green") + 
  mapview(absence, col.regions = "gray")
## Warning in cbind(`Feature ID` = fid, mat): number of rows of result is not a
## multiple of vector length (arg 1)
if (!file.exists(pts_env_csv) | redo){

  # combine presence and absence into single set of labeled points 
  pts <- rbind(
    obs %>% 
      mutate(
        present = 1) %>% 
      select(present, key),
    absence %>% 
      mutate(
        present = 0,
        key     = NA)) %>% 
    mutate(
      ID = 1:n()) %>% 
    relocate(ID)
  write_sf(pts, pts_geo, delete_dsn=T)

  # extract raster values for points
  pts_env <- raster::extract(env_stack, as_Spatial(pts), df=TRUE) %>% 
    tibble() %>% 
    # join present and geometry columns to raster value results for points
    left_join(
      pts %>% 
        select(ID, present),
      by = "ID") %>% 
    relocate(present, .after = ID) %>% 
    # extract lon, lat as single columns
    mutate(
      #present = factor(present),
      lon = st_coordinates(geometry)[,1],
      lat = st_coordinates(geometry)[,2]) %>% 
    select(-geometry)
  write_csv(pts_env, pts_env_csv)
}
pts_env <- read_csv(pts_env_csv)

pts_env %>% 
  # show first 10 presence, last 10 absence
  slice(c(1:10, (nrow(pts_env)-9):nrow(pts_env))) %>% 
  DT::datatable(
    rownames = F,
    options = list(
      dom = "t",
      pageLength = 20))

In the end this table is the data that feeds into our species distribution model (y ~ X), where:

  • y is the present column with values of 1 (present) or 0 (absent)
  • X is all other columns: WC_alt, WC_bio1, WC_bio2, ER_tri, ER_topoWet, lon, lat

Term Plots

In the vein of exploratory data analyses, before going into modeling let’s look at the data. Specifically, let’s look at how obviously differentiated is the presence versus absence for each predictor – a more pronounced presence peak should make for a more confident model. A plot for a specific predictor and response is called a “term plot.” In this case we’ll look for predictors where the presence (present = 1) occupies a distinct “niche” from the background absence points (present = 0).

pts_env %>% 
  select(-ID) %>% 
  mutate(
    present = factor(present)) %>% 
  pivot_longer(-present) %>% 
  ggplot() +
  geom_density(aes(x = value, fill = present)) + 
  scale_fill_manual(values = alpha(c("gray", "green"), 0.5)) +
  scale_x_continuous(expand=c(0,0)) +
  scale_y_continuous(expand=c(0,0)) +
  theme_bw() + 
  facet_wrap(~name, scales = "free") +
  labs(title = "Baccharis pilularis Term Plots") +
  theme(
    legend.position = c(1, 0),
    legend.justification = c(1, 0))
## Warning: Removed 191 rows containing non-finite values (stat_density).

References

Bogler, David. 2012. “Baccharis Pilularis.” Jepson Flora Project (Eds.) Jepson eFlora. https://ucjeps.berkeley.edu/eflora/eflora_display.php?tid=1603.